Análise descritiva¶
Projeto PI-4
In [1]:
import pandas as pd
import geopandas as gpd
import plotly.express as px
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
2024-11-17 23:40:49.697671: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`. 2024-11-17 23:40:49.698271: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used. 2024-11-17 23:40:49.700420: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used. 2024-11-17 23:40:49.706678: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered 2024-11-17 23:40:49.717067: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered 2024-11-17 23:40:49.720124: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered 2024-11-17 23:40:49.727968: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations. To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags. 2024-11-17 23:40:50.257268: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT
In [2]:
df=pd.read_csv("br-capes-btd-2021-2023-10-31.csv", encoding='ISO-8859-1', delimiter=';')
In [3]:
df_columns= pd.DataFrame(df.columns, columns=['Estados'])
In [4]:
df['DT_TITULACAO'].head(1)
Out[4]:
0 30JUN2021:00:00:00 Name: DT_TITULACAO, dtype: object
In [5]:
df_columns
Out[5]:
| Estados | |
|---|---|
| 0 | AN_BASE |
| 1 | CD_PROGRAMA |
| 2 | NM_PROGRAMA |
| 3 | CD_ENTIDADE_CAPES |
| 4 | SG_ENTIDADE_ENSINO |
| 5 | NM_ENTIDADE_ENSINO |
| 6 | ID_ADD_PRODUCAO_INTELECTUAL |
| 7 | ID_PRODUCAO_INTELECTUAL |
| 8 | NM_PRODUCAO |
| 9 | ID_SUBTIPO_PRODUCAO |
| 10 | NM_SUBTIPO_PRODUCAO |
| 11 | ID_AREA_CONCENTRACAO |
| 12 | NM_AREA_CONCENTRACAO |
| 13 | ID_LINHA_PESQUISA |
| 14 | NM_LINHA_PESQUISA |
| 15 | ID_PROJETO |
| 16 | NM_PROJETO |
| 17 | DH_INICIO_AREA_CONC |
| 18 | DH_FIM_AREA_CONC |
| 19 | DH_INICIO_LINHA |
| 20 | DH_FIM_LINHA |
| 21 | DT_TITULACAO |
| 22 | DS_PALAVRA_CHAVE |
| 23 | DS_ABSTRACT |
| 24 | DS_KEYWORD |
| 25 | IN_TRABALHO_MESMA_AREA |
| 26 | NM_TP_VINCULO |
| 27 | IN_ORIENT_PARTICIPOU_BANCA |
| 28 | DS_BIBLIOTECA_DEPOSITARIA |
| 29 | ID_TP_EXPECTATIVA_ATUACAO |
| 30 | NM_EXPECTATIVA_ATUACAO |
| 31 | ID_PESSOA_DISCENTE |
| 32 | NM_DISCENTE |
| 33 | DT_MATRICULA |
| 34 | ID_GRAU_ACADEMICO |
| 35 | NM_GRAU_ACADEMICO |
| 36 | NM_ORIENTADOR |
| 37 | DS_CATEGORIA_ORIENTADOR |
| 38 | NM_CATEGORIA_DOCENTE |
| 39 | NM_REGIAO |
| 40 | SG_UF_IES |
| 41 | NM_UF_IES |
| 42 | CD_GRANDE_AREA_CONHECIMENTO |
| 43 | NM_GRANDE_AREA_CONHECIMENTO |
| 44 | CD_AREA_CONHECIMENTO |
| 45 | NM_AREA_CONHECIMENTO |
| 46 | CD_SUBAREA_CONHECIMENTO |
| 47 | NM_SUBAREA_CONHECIMENTO |
| 48 | CD_ESPECIALIDADE |
| 49 | NM_ESPECIALIDADE |
| 50 | NM_AREA_AVALIACAO |
| 51 | NR_VOLUME |
| 52 | NR_PAGINAS |
| 53 | NM_IDIOMA |
| 54 | DS_RESUMO |
| 55 | DS_URL_TEXTO_COMPLETO |
| 56 | ID_PESSOA_ORIENTADOR |
| 57 | IN_TCC_COM_VINCULO_PRODUCAO |
| 58 | ID_ADD_PRODUCAO_VINCULO_CT |
- Medidas tendencia central
- Boxplot
- Histograma
- Curtose, assimetria
- Variação intraclasse -> determinar correlação
- Variação interclasse -> determinar correlação
- Coeficiente de crammer -> variaveis categoricas
Grafico
Grafico de linhas
- Numero de trabalhos publicados ao longo do tempo, cresce ou aumenta?
Grafico de barras
- comparar as classes de trabalhos - áreas
Grafico de bolhas - geográfico
Mapas de calor para verificar quais as dias do ano mais pessoas defendem ao longo dos anos
Tree map alternativa ao grafico de pizza
Grafico de barras um do lado da outra para comparar defesa de mulheres e homens (2021,2022,2023)
dados cíclicos - grafico radar
Mapa cloroplético -
correlações
INVESTIMENTO FEDERAL em educação vs numero de pos graduandos
Trabalhos por estado¶
In [6]:
df_grouped=pd.DataFrame(df.groupby('NM_UF_IES').size())
df_grouped = df_grouped.reset_index(names='Count')
df_grouped.columns = ['Estados', 'Count']
In [7]:
fig = px.bar(df_grouped.sort_values(by='Count', ascending=False), x='Estados', y='Count', title='Distribuição de trabalhos por estado')
fig.update_layout(yaxis_title='Count', xaxis_title='Estado')
fig.show()
In [8]:
shapefile_path = "BR_UF_2022/BR_UF_2022.shp"
gdf = gpd.read_file(shapefile_path)
gdf['centroid'] = gdf.geometry.centroid
gdf.head(30)
/tmp/ipykernel_2871610/4091591602.py:3: UserWarning: Geometry is in a geographic CRS. Results from 'centroid' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.
Out[8]:
| CD_UF | NM_UF | SIGLA_UF | NM_REGIAO | AREA_KM2 | geometry | centroid | |
|---|---|---|---|---|---|---|---|
| 0 | 12 | Acre | AC | Norte | 164173.429 | POLYGON ((-68.79282 -10.99957, -68.79367 -10.9... | POINT (-70.47293 -9.21327) |
| 1 | 13 | Amazonas | AM | Norte | 1559255.881 | POLYGON ((-56.76292 -3.23221, -56.76789 -3.242... | POINT (-64.65345 -4.15411) |
| 2 | 15 | Pará | PA | Norte | 1245870.704 | MULTIPOLYGON (((-48.97548 -0.19834, -48.97487 ... | POINT (-53.07149 -3.98042) |
| 3 | 16 | Amapá | AP | Norte | 142470.762 | MULTIPOLYGON (((-51.04561 -0.05088, -51.05422 ... | POINT (-51.96202 1.44746) |
| 4 | 17 | Tocantins | TO | Norte | 277423.627 | POLYGON ((-48.2483 -13.19239, -48.24844 -13.19... | POINT (-48.3313 -10.14808) |
| 5 | 21 | Maranhão | MA | Nordeste\n | 329651.496 | MULTIPOLYGON (((-44.5868 -2.23341, -44.58696 -... | POINT (-45.28777 -5.07221) |
| 6 | 22 | Piauí | PI | Nordeste\n | 251755.481 | POLYGON ((-42.47034 -3.48377, -42.46126 -3.484... | POINT (-42.97045 -7.3893) |
| 7 | 23 | Ceará | CE | Nordeste\n | 148894.447 | POLYGON ((-37.87162 -4.3664, -37.87109 -4.3670... | POINT (-39.61579 -5.09322) |
| 8 | 24 | Rio Grande do Norte | RN | Nordeste\n | 52809.599 | MULTIPOLYGON (((-35.18728 -5.78987, -35.18707 ... | POINT (-36.67327 -5.8398) |
| 9 | 25 | Paraíba | PB | Nordeste\n | 56467.242 | MULTIPOLYGON (((-34.7958 -7.175, -34.79578 -7.... | POINT (-36.83246 -7.12104) |
| 10 | 26 | Pernambuco | PE | Nordeste\n | 98067.877 | MULTIPOLYGON (((-35.04823 -8.60936, -35.04756 ... | POINT (-37.99768 -8.32522) |
| 11 | 27 | Alagoas | AL | Nordeste\n | 27830.661 | MULTIPOLYGON (((-35.287 -9.14489, -35.28699 -9... | POINT (-36.62485 -9.51367) |
| 12 | 28 | Sergipe | SE | Nordeste\n | 21938.188 | MULTIPOLYGON (((-37.01203 -10.92784, -37.01267... | POINT (-37.44379 -10.58376) |
| 13 | 29 | Bahia | BA | Nordeste\n | 564760.429 | MULTIPOLYGON (((-39.26447 -8.61413, -39.26341 ... | POINT (-41.72116 -12.47533) |
| 14 | 31 | Minas Gerais | MG | Sudeste\n | 586513.983 | POLYGON ((-42.51148 -14.98627, -42.50964 -14.9... | POINT (-44.67336 -18.45618) |
| 15 | 32 | Espírito Santo | ES | Sudeste\n | 46074.448 | MULTIPOLYGON (((-40.27883 -20.33437, -40.27883... | POINT (-40.66851 -19.57518) |
| 16 | 33 | Rio de Janeiro | RJ | Sudeste\n | 43750.425 | MULTIPOLYGON (((-42.00612 -22.88563, -42.00634... | POINT (-42.66278 -22.19572) |
| 17 | 35 | São Paulo | SP | Sudeste\n | 248219.485 | MULTIPOLYGON (((-46.47312 -22.70498, -46.47289... | POINT (-48.72896 -22.26584) |
| 18 | 41 | Paraná | PR | Sul\n | 199298.981 | MULTIPOLYGON (((-48.30974 -25.49328, -48.27691... | POINT (-51.61664 -24.63588) |
| 19 | 42 | Santa Catarina | SC | Sul\n | 95730.690 | MULTIPOLYGON (((-49.23653 -26.03711, -49.2365 ... | POINT (-50.47471 -27.2474) |
| 20 | 43 | Rio Grande do Sul | RS | Sul\n | 281707.151 | MULTIPOLYGON (((-51.71873 -31.85463, -51.71941... | POINT (-53.24515 -29.78646) |
| 21 | 50 | Mato Grosso do Sul | MS | Centro-oeste\n | 357142.082 | POLYGON ((-54.68379 -23.8305, -54.68569 -23.83... | POINT (-54.84556 -20.32733) |
| 22 | 51 | Mato Grosso | MT | Centro-oeste\n | 903208.361 | POLYGON ((-56.0716 -17.17062, -56.07246 -17.17... | POINT (-55.91228 -12.94898) |
| 23 | 52 | Goiás | GO | Centro-oeste\n | 340242.859 | POLYGON ((-47.33502 -15.58733, -47.33512 -15.5... | POINT (-49.62251 -16.04119) |
| 24 | 53 | Distrito Federal | DF | Centro-oeste\n | 5760.784 | POLYGON ((-48.01472 -16.04996, -48.01573 -16.0... | POINT (-47.79685 -15.78117) |
| 25 | 11 | Rondônia | RO | Norte | 237754.172 | POLYGON ((-62.60021 -13.01675, -62.59999 -13.0... | POINT (-62.84196 -10.91314) |
| 26 | 14 | Roraima | RR | Norte | 223644.530 | POLYGON ((-60.12972 4.50843, -60.1296 4.50826,... | POINT (-61.39191 2.08271) |
In [9]:
df_sg=pd.DataFrame(df.groupby('SG_UF_IES').size())
df_sg = df_sg.reset_index(names='Count')
df_sg.columns = ['SIGLA_UF', 'Count']
df_sg.head()
Out[9]:
| SIGLA_UF | Count | |
|---|---|---|
| 0 | AC | 215 |
| 1 | AL | 676 |
| 2 | AM | 860 |
| 3 | AP | 139 |
| 4 | BA | 3045 |
In [10]:
gdf_joined = gdf.merge(df_sg, how="left", on="SIGLA_UF")
gdf.head(30)
Out[10]:
| CD_UF | NM_UF | SIGLA_UF | NM_REGIAO | AREA_KM2 | geometry | centroid | |
|---|---|---|---|---|---|---|---|
| 0 | 12 | Acre | AC | Norte | 164173.429 | POLYGON ((-68.79282 -10.99957, -68.79367 -10.9... | POINT (-70.47293 -9.21327) |
| 1 | 13 | Amazonas | AM | Norte | 1559255.881 | POLYGON ((-56.76292 -3.23221, -56.76789 -3.242... | POINT (-64.65345 -4.15411) |
| 2 | 15 | Pará | PA | Norte | 1245870.704 | MULTIPOLYGON (((-48.97548 -0.19834, -48.97487 ... | POINT (-53.07149 -3.98042) |
| 3 | 16 | Amapá | AP | Norte | 142470.762 | MULTIPOLYGON (((-51.04561 -0.05088, -51.05422 ... | POINT (-51.96202 1.44746) |
| 4 | 17 | Tocantins | TO | Norte | 277423.627 | POLYGON ((-48.2483 -13.19239, -48.24844 -13.19... | POINT (-48.3313 -10.14808) |
| 5 | 21 | Maranhão | MA | Nordeste\n | 329651.496 | MULTIPOLYGON (((-44.5868 -2.23341, -44.58696 -... | POINT (-45.28777 -5.07221) |
| 6 | 22 | Piauí | PI | Nordeste\n | 251755.481 | POLYGON ((-42.47034 -3.48377, -42.46126 -3.484... | POINT (-42.97045 -7.3893) |
| 7 | 23 | Ceará | CE | Nordeste\n | 148894.447 | POLYGON ((-37.87162 -4.3664, -37.87109 -4.3670... | POINT (-39.61579 -5.09322) |
| 8 | 24 | Rio Grande do Norte | RN | Nordeste\n | 52809.599 | MULTIPOLYGON (((-35.18728 -5.78987, -35.18707 ... | POINT (-36.67327 -5.8398) |
| 9 | 25 | Paraíba | PB | Nordeste\n | 56467.242 | MULTIPOLYGON (((-34.7958 -7.175, -34.79578 -7.... | POINT (-36.83246 -7.12104) |
| 10 | 26 | Pernambuco | PE | Nordeste\n | 98067.877 | MULTIPOLYGON (((-35.04823 -8.60936, -35.04756 ... | POINT (-37.99768 -8.32522) |
| 11 | 27 | Alagoas | AL | Nordeste\n | 27830.661 | MULTIPOLYGON (((-35.287 -9.14489, -35.28699 -9... | POINT (-36.62485 -9.51367) |
| 12 | 28 | Sergipe | SE | Nordeste\n | 21938.188 | MULTIPOLYGON (((-37.01203 -10.92784, -37.01267... | POINT (-37.44379 -10.58376) |
| 13 | 29 | Bahia | BA | Nordeste\n | 564760.429 | MULTIPOLYGON (((-39.26447 -8.61413, -39.26341 ... | POINT (-41.72116 -12.47533) |
| 14 | 31 | Minas Gerais | MG | Sudeste\n | 586513.983 | POLYGON ((-42.51148 -14.98627, -42.50964 -14.9... | POINT (-44.67336 -18.45618) |
| 15 | 32 | Espírito Santo | ES | Sudeste\n | 46074.448 | MULTIPOLYGON (((-40.27883 -20.33437, -40.27883... | POINT (-40.66851 -19.57518) |
| 16 | 33 | Rio de Janeiro | RJ | Sudeste\n | 43750.425 | MULTIPOLYGON (((-42.00612 -22.88563, -42.00634... | POINT (-42.66278 -22.19572) |
| 17 | 35 | São Paulo | SP | Sudeste\n | 248219.485 | MULTIPOLYGON (((-46.47312 -22.70498, -46.47289... | POINT (-48.72896 -22.26584) |
| 18 | 41 | Paraná | PR | Sul\n | 199298.981 | MULTIPOLYGON (((-48.30974 -25.49328, -48.27691... | POINT (-51.61664 -24.63588) |
| 19 | 42 | Santa Catarina | SC | Sul\n | 95730.690 | MULTIPOLYGON (((-49.23653 -26.03711, -49.2365 ... | POINT (-50.47471 -27.2474) |
| 20 | 43 | Rio Grande do Sul | RS | Sul\n | 281707.151 | MULTIPOLYGON (((-51.71873 -31.85463, -51.71941... | POINT (-53.24515 -29.78646) |
| 21 | 50 | Mato Grosso do Sul | MS | Centro-oeste\n | 357142.082 | POLYGON ((-54.68379 -23.8305, -54.68569 -23.83... | POINT (-54.84556 -20.32733) |
| 22 | 51 | Mato Grosso | MT | Centro-oeste\n | 903208.361 | POLYGON ((-56.0716 -17.17062, -56.07246 -17.17... | POINT (-55.91228 -12.94898) |
| 23 | 52 | Goiás | GO | Centro-oeste\n | 340242.859 | POLYGON ((-47.33502 -15.58733, -47.33512 -15.5... | POINT (-49.62251 -16.04119) |
| 24 | 53 | Distrito Federal | DF | Centro-oeste\n | 5760.784 | POLYGON ((-48.01472 -16.04996, -48.01573 -16.0... | POINT (-47.79685 -15.78117) |
| 25 | 11 | Rondônia | RO | Norte | 237754.172 | POLYGON ((-62.60021 -13.01675, -62.59999 -13.0... | POINT (-62.84196 -10.91314) |
| 26 | 14 | Roraima | RR | Norte | 223644.530 | POLYGON ((-60.12972 4.50843, -60.1296 4.50826,... | POINT (-61.39191 2.08271) |
In [11]:
fig, ax = plt.subplots(1, 1, figsize=(15, 12))
gdf_joined.plot(column="Count", cmap='OrRd', linewidth=0.8, ax=ax, edgecolor='0.8', legend=True)
for idx, row in gdf.iterrows():
ax.text(row["centroid"].x, row["centroid"].y, row["SIGLA_UF"], fontsize=8, ha='center')
plt.show()
In [12]:
bar_height=20
margin_height=40
chart_height=(bar_height*len(df_grouped['Estados'])) + margin_height
df_grouped_sorted = df_grouped.sort_values(by='Count')
fig = go.Figure(go.Bar(
x= df_grouped_sorted['Count'],
y=df_grouped_sorted['Estados'],
orientation='h'
))
fig.update_layout(
title='Distribuição de trabalhos por estado',
xaxis_title='Estado',
yaxis_title='Número de trabalhos',
height= chart_height,
margin=dict(t=50,b=50)
)
fig.show()
Distribuição por área de conhecimento¶
In [13]:
df_grouped_areas_con=df.groupby('NM_GRANDE_AREA_CONHECIMENTO').size()
In [14]:
df_grouped_areas_con.head()
Out[14]:
NM_GRANDE_AREA_CONHECIMENTO CIÊNCIAS AGRÁRIAS 7139 CIÊNCIAS BIOLÓGICAS 4335 CIÊNCIAS DA SAÚDE 12563 CIÊNCIAS EXATAS E DA TERRA 6442 CIÊNCIAS HUMANAS 14011 dtype: int64
In [15]:
df_grouped_areas_con= df_grouped_areas_con.reset_index()
df_grouped_areas_con.columns= ['Area_conhecimento', 'Count']
In [16]:
df_grouped_areas_con.head()
Out[16]:
| Area_conhecimento | Count | |
|---|---|---|
| 0 | CIÊNCIAS AGRÁRIAS | 7139 |
| 1 | CIÊNCIAS BIOLÓGICAS | 4335 |
| 2 | CIÊNCIAS DA SAÚDE | 12563 |
| 3 | CIÊNCIAS EXATAS E DA TERRA | 6442 |
| 4 | CIÊNCIAS HUMANAS | 14011 |
In [ ]:
fig = px.bar(df_grouped_areas_con.sort_values(by='Count', ascending=False), x='Area_conhecimento', y='Count', title='Histograma Vertical')
fig.update_layout(yaxis_title='Count', xaxis_title='Area_conhecimento')
fig.show()
In [ ]:
In [18]:
colors = px.colors.qualitative.Pastel
fig2 = go.Figure(data=[go.Pie(labels=df_grouped_areas_con['Area_conhecimento'], values=df_grouped_areas_con['Count'], hole=0.4, marker=dict(colors=colors))])
fig2.show()
In [ ]:
Distribuição temporal¶
In [19]:
from datetime import datetime
In [20]:
df['DT_TITULACAO'] = df['DT_TITULACAO'].apply(lambda x: datetime.strptime(x,'%d%b%Y:%H:%M:%S'))
In [21]:
df['DT_TITULACAO'] = df['DT_TITULACAO'].apply(lambda x: datetime.strftime(x,'%d%m%Y'))
In [22]:
df.groupby('DT_TITULACAO')
Out[22]:
<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7653c0250450>
Processamento de Linguagem Natural¶
In [23]:
import nltk
from nltk.tokenize import word_tokenize
from pathlib import Path
resumo_df = df[["DS_RESUMO","NM_GRANDE_AREA_CONHECIMENTO"]]
resumo_df.columns = ["resumo", "area"]
resumo_df['tokens'] = resumo_df["resumo"].apply(lambda x: word_tokenize(x))
resumo_df.head()
--------------------------------------------------------------------------- ModuleNotFoundError Traceback (most recent call last) Cell In[23], line 1 ----> 1 import nltk 2 from nltk.tokenize import word_tokenize 3 from pathlib import Path ModuleNotFoundError: No module named 'nltk'
In [58]:
stopwords = {word.strip() for word in set(Path("stopwords.txt").read_text().split("\n"))}
stopwords
Out[58]:
{'a',
'ao',
'aos',
'aquela',
'aquelas',
'aquele',
'aqueles',
'aquilo',
'as',
'até',
'com',
'como',
'da',
'das',
'de',
'dela',
'delas',
'dele',
'deles',
'depois',
'do',
'dos',
'e',
'ela',
'elas',
'ele',
'eles',
'em',
'entre',
'era',
'eram',
'essa',
'essas',
'esse',
'esses',
'esta',
'estamos',
'estas',
'estava',
'estavam',
'este',
'esteja',
'estejam',
'estejamos',
'estes',
'esteve',
'estive',
'estivemos',
'estiver',
'estivera',
'estiveram',
'estiverem',
'estivermos',
'estivesse',
'estivessem',
'estivéramos',
'estivéssemos',
'estou',
'está',
'estávamos',
'estão',
'eu',
'foi',
'fomos',
'for',
'fora',
'foram',
'forem',
'formos',
'fosse',
'fossem',
'fui',
'fôramos',
'fôssemos',
'haja',
'hajam',
'hajamos',
'havemos',
'havia',
'hei',
'houve',
'houvemos',
'houver',
'houvera',
'houveram',
'houverei',
'houverem',
'houveremos',
'houveria',
'houveriam',
'houvermos',
'houverá',
'houverão',
'houveríamos',
'houvesse',
'houvessem',
'houvéramos',
'houvéssemos',
'há',
'hão',
'isso',
'isto',
'já',
'lhe',
'lhes',
'mais',
'mas',
'me',
'mesmo',
'meu',
'meus',
'minha',
'minhas',
'muito',
'na',
'nas',
'nem',
'no',
'nos',
'nossa',
'nossas',
'nosso',
'nossos',
'num',
'numa',
'não',
'nós',
'o',
'os',
'ou',
'para',
'pela',
'pelas',
'pelo',
'pelos',
'por',
'qual',
'quando',
'que',
'quem',
'se',
'seja',
'sejam',
'sejamos',
'sem',
'ser',
'serei',
'seremos',
'seria',
'seriam',
'será',
'serão',
'seríamos',
'seu',
'seus',
'somos',
'sou',
'sua',
'suas',
'são',
'só',
'também',
'te',
'tem',
'temos',
'tenha',
'tenham',
'tenhamos',
'tenho',
'ter',
'terei',
'teremos',
'teria',
'teriam',
'terá',
'terão',
'teríamos',
'teu',
'teus',
'teve',
'tinha',
'tinham',
'tive',
'tivemos',
'tiver',
'tivera',
'tiveram',
'tiverem',
'tivermos',
'tivesse',
'tivessem',
'tivéramos',
'tivéssemos',
'tu',
'tua',
'tuas',
'tém',
'têm',
'tínhamos',
'um',
'uma',
'você',
'vocês',
'vos',
'à',
'às',
'é',
'éramos'}
In [59]:
resumo_df["filtered"] = resumo_df["tokens"].apply(lambda x: " ".join([word for word in x if word.lower() not in stopwords]))
resumo_df.head()
Out[59]:
| resumo | area | tokens | filtered | |
|---|---|---|---|---|
| 0 | O TERRIT�RIO AMAZ�NICO � RECONHECIDO PELA SUA ... | MULTIDISCIPLINAR | [O, TERRIT�RIO, AMAZ�NICO, �, RECONHECIDO, PEL... | TERRIT�RIO AMAZ�NICO � RECONHECIDO GRANDE BIOD... |
| 1 | A RELA��O ENTRE O HOMEM E AS PLANTAS FOI ESTAB... | MULTIDISCIPLINAR | [A, RELA��O, ENTRE, O, HOMEM, E, AS, PLANTAS, ... | RELA��O HOMEM PLANTAS ESTABELECIDA DESDE PRIM�... |
| 2 | A UTILIZA��O DE MICRO-ORGANISMOS ENDOF�TICOS C... | MULTIDISCIPLINAR | [A, UTILIZA��O, DE, MICRO-ORGANISMOS, ENDOF�TI... | UTILIZA��O MICRO-ORGANISMOS ENDOF�TICOS FONTE ... |
| 3 | OS FUNGOS FILAMENTOSOS S�O CONSIDERADOS BOAS F... | MULTIDISCIPLINAR | [OS, FUNGOS, FILAMENTOSOS, S�O, CONSIDERADOS, ... | FUNGOS FILAMENTOSOS S�O CONSIDERADOS BOAS FONT... |
| 4 | A MAL�RIA � UMA DAS DOEN�AS MAIS FATAIS QUE AF... | MULTIDISCIPLINAR | [A, MAL�RIA, �, UMA, DAS, DOEN�AS, MAIS, FATAI... | MAL�RIA � DOEN�AS FATAIS AFETA HUMANIDADE . DU... |
In [61]:
resumo_df['area'].unique()
Out[61]:
array(['MULTIDISCIPLINAR'], dtype=object)
In [60]:
import wordcloud
output = Path("results")
output.mkdir(parents=True, exist_ok=True)
grouped_area = resumo_df.groupby("area")["filtered"].apply(' '.join).reset_index()
for index, row in grouped_area.iterrows():
filename = output / f"{row['area']}.png"
word = wordcloud.WordCloud(width=800, height=400, background_color="white", stopwords=stopwords).generate(row['filtered'])
word.to_file(filename)
In [ ]:
Usando berts para categorizar¶
In [101]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from official.nlp import optimization
from sklearn.model_selection import train_test_split
import shutil
import random
In [27]:
df.groupby(['NM_GRANDE_AREA_CONHECIMENTO']).size()
Out[27]:
NM_GRANDE_AREA_CONHECIMENTO CIÊNCIAS AGRÁRIAS 7139 CIÊNCIAS BIOLÓGICAS 4335 CIÊNCIAS DA SAÚDE 12563 CIÊNCIAS EXATAS E DA TERRA 6442 CIÊNCIAS HUMANAS 14011 CIÊNCIAS SOCIAIS APLICADAS 12377 ENGENHARIAS 7580 LINGÜÍSTICA, LETRAS E ARTES 5380 MULTIDISCIPLINAR 12411 dtype: int64
In [28]:
data = df[['DS_ABSTRACT','CD_GRANDE_AREA_CONHECIMENTO' ] ]
In [29]:
data = data.rename(columns={'DS_ABSTRACT': 'X', 'CD_GRANDE_AREA_CONHECIMENTO': 'Y'})
In [30]:
data['X'].head()
Out[30]:
0 THE AMAZON TERRITORY IS RECOGNIZED FOR ITS GRE... 1 THE RELATIONSHIP BETWEEN HUMANITY AND PLANTS H... 2 DUE TO THEIR VERSATILE METABOLITES, THE USE OF... 3 FILAMENTOUS FUNGI ARE CONSIDERED GOOD SOURCES ... 4 MALARIA IS ONE OF THE MOST FATAL DISEASES AFFE... Name: X, dtype: object
In [74]:
data_shuffled = data.sample(frac=1, random_state=42).reset_index(drop=True).dropna() # frac=1 means shuffle all data
In [75]:
train_size = int(0.8 * len(data_shuffled))
In [91]:
df_train_val = data_shuffled[:train_size] # Training set (80%)
df_test = data_shuffled[train_size:]
In [92]:
df_train= df_train_val[:train_size]
df_val = df_train_val[train_size:]
In [78]:
# X_train_val, X_test, y_train_val, y_test = train_test_split(df['X'], df['Y'], test_size=0.2, random_state=42)
# x_train, x_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.2, random_state=42)
In [79]:
AUTOTUNE = tf.data.AUTOTUNE
batch_size = 32
seed = 42
In [93]:
X = df_train['X'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
Y = df_train['Y'].values
df_val = df_val['X'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
df_test = df_test['X'].str.replace(r'[^A-Za-z0-9\s]+', '', regex=True)
In [82]:
vectorizer = tf.keras.layers.TextVectorization(max_tokens=30000, output_sequence_length=100)
In [83]:
vectorizer.adapt(X)
In [84]:
train_ds = tf.data.Dataset.from_tensor_slices((X, Y))
In [86]:
def vectorize_text(text, label):
text = tf.expand_dims(text, -1) # Expand dims to add an extra dimension for the vectorizer
return vectorizer(text), label
In [87]:
dataset = tf.data.Dataset.from_tensor_slices((X, Y))
In [88]:
vectorized_ds = dataset.map(vectorize_text)
In [89]:
vectorized_ds = vectorized_ds.batch(32).prefetch(buffer_size=tf.data.AUTOTUNE)
In [96]:
for text_batch, label_batch in vectorized_ds.take(1):
for i in range(1):
print(f"Abstract: {text_batch.numpy()[1]}")
label=label_batch.numpy()[i]
print(f"label: {label}")
Abstract: [[ 2 4350 1385 515 16 2 94 3624 4920 14 1153 109
2 2453 3 2253 15354 1487 6970 2964 8 2 1 1
4 4912 3806 16 2 2376 11308 4920 2 4350 1385 3136
110 11941 1249 555 4 1721 2 555 10566 45 7 2654
2 4350 1385 515 16 2 94 3624 4920 14 1153 109
2 2453 3 2253 15354 1487 6970 2964 8 2 1 1
4 4912 3806 16 2 2376 11308 4920 2 4350 1385 3136
110 11941 1249 555 4 1721 2 555 10566 45 7 12731
8189 3 1603 4]]
label: 40000001
2024-10-21 21:17:00.966064: I tensorflow/core/framework/local_rendezvous.cc:404] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence
In [102]:
#@title ##Escolher um modelo BERT pré-treinado (TensorFlow Hub)
map_name_to_handle = {
'bert_en_uncased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/3',
'bert_en_cased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3',
'bert_multi_cased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_multi_cased_L-12_H-768_A-12/3',
'small_bert/bert_en_uncased_L-2_H-128_A-2':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-128_A-2/1',
'small_bert/bert_en_uncased_L-2_H-256_A-4':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-256_A-4/1',
'small_bert/bert_en_uncased_L-2_H-512_A-8':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-512_A-8/1',
'small_bert/bert_en_uncased_L-2_H-768_A-12':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1',
'small_bert/bert_en_uncased_L-4_H-128_A-2':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-128_A-2/1',
'small_bert/bert_en_uncased_L-4_H-256_A-4':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-256_A-4/1',
'small_bert/bert_en_uncased_L-4_H-512_A-8':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1',
'small_bert/bert_en_uncased_L-4_H-768_A-12':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-768_A-12/1',
'small_bert/bert_en_uncased_L-6_H-128_A-2':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-128_A-2/1',
'small_bert/bert_en_uncased_L-6_H-256_A-4':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-256_A-4/1',
'small_bert/bert_en_uncased_L-6_H-512_A-8':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-512_A-8/1',
'small_bert/bert_en_uncased_L-6_H-768_A-12':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-6_H-768_A-12/1',
'small_bert/bert_en_uncased_L-8_H-128_A-2':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-128_A-2/1',
'small_bert/bert_en_uncased_L-8_H-256_A-4':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-256_A-4/1',
'small_bert/bert_en_uncased_L-8_H-512_A-8':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-512_A-8/1',
'small_bert/bert_en_uncased_L-8_H-768_A-12':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-8_H-768_A-12/1',
'small_bert/bert_en_uncased_L-10_H-128_A-2':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-128_A-2/1',
'small_bert/bert_en_uncased_L-10_H-256_A-4':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-256_A-4/1',
'small_bert/bert_en_uncased_L-10_H-512_A-8':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-512_A-8/1',
'small_bert/bert_en_uncased_L-10_H-768_A-12':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-10_H-768_A-12/1',
'small_bert/bert_en_uncased_L-12_H-128_A-2':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-128_A-2/1',
'small_bert/bert_en_uncased_L-12_H-256_A-4':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1',
'small_bert/bert_en_uncased_L-12_H-512_A-8':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-512_A-8/1',
'small_bert/bert_en_uncased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-768_A-12/1',
'albert_en_base':
'https://tfhub.dev/tensorflow/albert_en_base/2',
'electra_small':
'https://tfhub.dev/google/electra_small/2',
'electra_base':
'https://tfhub.dev/google/electra_base/2',
'experts_pubmed':
'https://tfhub.dev/google/experts/bert/pubmed/2',
'experts_wiki_books':
'https://tfhub.dev/google/experts/bert/wiki_books/2',
'talking-heads_base':
'https://tfhub.dev/tensorflow/talkheads_ggelu_bert_en_base/1',
}
map_model_to_preprocess = {
'bert_en_uncased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'bert_en_cased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3',
'small_bert/bert_en_uncased_L-2_H-128_A-2':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-2_H-256_A-4':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-2_H-512_A-8':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-2_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-4_H-128_A-2':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-4_H-256_A-4':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-4_H-512_A-8':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-4_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-6_H-128_A-2':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-6_H-256_A-4':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-6_H-512_A-8':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-6_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-8_H-128_A-2':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-8_H-256_A-4':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-8_H-512_A-8':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-8_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-10_H-128_A-2':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-10_H-256_A-4':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-10_H-512_A-8':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-10_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-12_H-128_A-2':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-12_H-256_A-4':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-12_H-512_A-8':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'small_bert/bert_en_uncased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'bert_multi_cased_L-12_H-768_A-12':
'https://tfhub.dev/tensorflow/bert_multi_cased_preprocess/3',
'albert_en_base':
'https://tfhub.dev/tensorflow/albert_en_preprocess/3',
'electra_small':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'electra_base':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'experts_pubmed':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'experts_wiki_books':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
'talking-heads_base':
'https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3',
}
bert_model_name = 'small_bert/bert_en_uncased_L-4_H-512_A-8'
#bert_model_name = 'small_bert/bert_en_uncased_L-2_H-128_A-2'
tfhub_handle_encoder = map_name_to_handle[bert_model_name]
tfhub_handle_preprocess = map_model_to_preprocess[bert_model_name]
print(f'modelo BERT selecionado : {tfhub_handle_encoder}')
print(f'Modelo de pré-processamento auto-selecionado: {tfhub_handle_preprocess}')
modelo BERT selecionado : https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1 Modelo de pré-processamento auto-selecionado: https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3
In [105]:
bert_preprocess_model = hub.KerasLayer(tfhub_handle_preprocess)
In [ ]:
In [ ]: